Overview

This file simulates the data I anticipate for the coordinated analysis that will be my dissertation.

metadata <- tibble(
  study = c("new_moms", "deception_detection", "karyn_diss", "murat_rep", 
             "mideast_men", "stem", "barter", "double_empathy"),
  targets = c(20, 95, 212, 200, 
              9, 59, 310, 8),
  perceivers = c(60, 95, 212, 200,
                 326, 121, 310, 100),
  videos = c(20, 95, 318, 300,
             9, 121, 155, 8),
  paradigm = c("ss", "di", "di", "di",
               "ss", "di", "di", "ss") %>% 
    factor(levels = c("ss", "di"), labels = c("Standard Stimulus", 
           "Dyadic Interaction")),
  inference_schedule = c("Variable", "Variable", "Set", "Set",
                         "Variable", "Set", "Variable", "Set") %>% 
    as.factor()
) 
vrm <- c("Disclosure", "Edification", "Advisement", "Confirmation", "Question", "Acknowledgment", "Interpretation", "Reflection")
generate_random_number <- function(mean = 8, sd = 3, min = 3, max = 19, digits = 0) {
  random_number <- NA
  while (is.na(random_number) || random_number < min || random_number > max) {
    random_number <- round(rnorm(1, mean = mean, sd = sd), digits)
  }
  return(random_number)
}

multiply_out <- function(df, n_column, column_name) {
  df_expanded <- df %>%
    rowwise() %>%
    mutate(!!column_name := list(seq_len(!!sym(n_column)))) %>%
    unnest(cols = !!sym(column_name))
  
  return(df_expanded)
}
SimulateStudy <- function(study_name, paradigm, seed = 123, n_perceivers = 1, n_videos_per_perceiver = 1){
  set.seed(seed)
  # Filter for current study
  study_data <- metadata %>% 
    filter(study == study_name) 

  # Simulate number of chapters within each video
  df = tibble(
    name = paste0(study_name, "_", 1:study_data$videos),
    n_video = 1:study_data$videos,
    n_chapter = NA
  )
  for(i in seq_len(study_data$videos)){
    df$n_chapter[i] <- generate_random_number()
  }
  df <- multiply_out(df, n_column = "n_chapter", column_name = "chapter")
 
   # Simulate number of turns within each chapter
  for(i in seq_len(study_data$videos)){
    df$n_turns[i] <- generate_random_number(mean = 11, sd = 6, 
                                           min = 4, max = 40)
  }
  df <- multiply_out(df, n_column = "n_turns", column_name = "turn")
  
  # STIMULUS LEVEL VARIABLES
  df <- df %>% 
    group_by(name, chapter) %>% 
    mutate(
      chapter_length = generate_random_number(mean = 45, sd = 6, 
                                              min = 18, max = 120,
                                              digits = 3),
      turn_length = {raw_turn_lengths <- runif(n(), min = 4, max = 40)
                     scaled_turn_lengths <- raw_turn_lengths / sum(raw_turn_lengths) *
                      chapter_length
                     round(scaled_turn_lengths, 3)
                    },
      start_time = cumsum(lag(turn_length, default = 0)),
      end_time = cumsum(turn_length),
      turns_from_inference = n() - row_number() + 1,
      turn_percent_through_chapter = (row_number() / n()) * 100,
      time_percent_through_chapter = end_time/chapter_length * 100,
      speaker = ifelse(rep(sample(c(TRUE, FALSE), 1), n()), 
                         rep(c("Partner", "Target"), length.out = n()), 
                         rep(c("Target", "Partner"), length.out = n()))  %>% 
        factor(),
      sem_sim = {
        repeat {
          base_random <- runif(n(), min = -1.00, max = 1.00)
          weight <- ifelse(speaker == "Partner",
                           ((turn_percent_through_chapter - 1) / 180)^2, 
                           ((turn_percent_through_chapter - 1) / 120)^2) 
          noise <- ifelse(speaker == "Partner",
                          rnorm(n(), mean = 0, sd = 0.3),  
                          rnorm(n(), mean = 0, sd = 0.1))  
          sem_sim_raw <- base_random * (1 - weight) + 1 * weight + noise
          if (sum(sem_sim_raw <= -0.99 | sem_sim_raw >= 0.99) / n() < 0.05) {
            break
          }
        }
       
        pmin(pmax(sem_sim_raw, -1.00), 1.00)
      },
    vrm = sample(vrm, n(), replace = TRUE)
  )
 
  
   # PARTICIPANT-LEVEL VARIABLES
  if(paradigm == "DI"){
    df <- df %>% 
      mutate(
        target = paste0(name, "_target_", n_video),
        perceiver = paste0(name, "_perceiver_", n_video),
        partner = paste0(name, "_partner_", n_video),
        paradigm = "Dyadic Interaction"
      )
  } else if (paradigm == "SS"){
    # have to double-up on the naming because nesting removes the grouping column
    df <- df %>% 
      mutate(
        name2 = name
      )
    df_list <- df %>% 
      group_by(name) %>% 
      nest() 

    out_list <- list()
    
    for(i in seq_len(n_perceivers)){
      df_i <- sample(df_list$data, n_videos_per_perceiver) %>% 
          bind_rows()
      df_i <- df_i %>% 
        mutate(
          target = paste0(name2, "_target_", n_video),
          perceiver = paste0(name2, "_perceiver_", i),
          partner = paste0(name2, "_partner_", n_video),
          paradigm = "Standard Stimulus"
      )
      out_list[[i]] <- df_i
    }
    df <- bind_rows(out_list)
    df$name <- df$name2
    df <- df %>% 
      dplyr::select(-name2)
  }
  return(df)
}
df <- list(
           stem = SimulateStudy("stem", paradigm = "DI"),
           barter = SimulateStudy("barter", paradigm = "DI"),
           deception_detection = SimulateStudy("deception_detection", paradigm = "DI"),
           new_moms = SimulateStudy("new_moms", 
                                    paradigm = "SS", 
                                    n_perceivers = 3, 
                                    n_videos_per_perceiver = 3),
           karyn_diss = SimulateStudy("karyn_diss", 
                                      paradigm = "SS", 
                                      n_perceivers = 212, 
                                      n_videos_per_perceiver = 3),
           murat_rep = SimulateStudy("karyn_diss", 
                                     paradigm = "SS", 
                                     n_perceivers = 200, 
                                     n_videos_per_perceiver = 3),
           mideast_men = SimulateStudy("mideast_men", 
                                       paradigm = "SS",
                                       n_perceivers = 326, 
                                       n_videos_per_perceiver = 4),
           double_empathy = SimulateStudy("double_empathy", 
                                          paradigm = "SS",
                                          n_perceivers = 100, 
                                          n_videos_per_perceiver = 4)
          ) %>% 
  bind_rows() %>% 
  ungroup()
## Warning: Unknown or uninitialised column: `n_turns`.
## Unknown or uninitialised column: `n_turns`.
## Unknown or uninitialised column: `n_turns`.
## Unknown or uninitialised column: `n_turns`.
## Unknown or uninitialised column: `n_turns`.
## Unknown or uninitialised column: `n_turns`.
## Unknown or uninitialised column: `n_turns`.
## Unknown or uninitialised column: `n_turns`.
df <- df %>% 
  mutate(across(where(is.character), factor))
set.seed(123)
language_mat <- c(
                   1.00,  0.50,  0.60, 0.40,  0.45,  0.55, -0.20, -0.25, -0.15,  0.30,
                   0.50,  1.00,  0.45, 0.35,  0.30,  0.40, -0.10, -0.15, -0.05,  0.25,
                   0.60,  0.45,  1.00, 0.50,  0.40,  0.35, -0.25, -0.20, -0.10,  0.35,
                   0.40,  0.35,  0.50, 1.00,  0.25,  0.30,  0.10,  0.15,  0.20,  0.10,
                   0.45,  0.30,  0.40, 0.25,  1.00,  0.50, -0.15, -0.10, -0.05,  0.40,
                   0.55,  0.40,  0.35, 0.30,  0.50,  1.00, -0.20, -0.15, -0.10,  0.30,
                  -0.20, -0.10, -0.25, 0.10, -0.15, -0.20,  1.00,  0.60,  0.50, -0.30,
                  -0.25, -0.15, -0.20, 0.15, -0.10, -0.15,  0.60,  1.00,  0.50, -0.40,
                  -0.15, -0.05, -0.10, 0.20, -0.05, -0.10,  0.50,  0.50,  1.00, -0.35,
                   0.30,  0.25,  0.35, 0.10,  0.40,  0.30, -0.30, -0.40, -0.35,  1.00)
language_noise <- rnorm(length(language_mat), mean = -.03, sd = .01)
language_mat <- language_mat + language_noise
# Independent matrix
language_vecs <- rnorm_multi(n = nrow(df),
                             mu = c(cog_processing_language = 4,
                                    memory_language = 2,
                                    certain_language = 2,
                                    self_ref_language = 10,
                                    curious_language = 1,
                                    perception_language = 2,
                                    emo_anxious = 1,
                                    emo_sad = 2,
                                    emo_anger = 0.5,
                                    emo_positive = 1.5),
                             sd = c(3, 4, 4, 2, 10, 
                                    10, 15, 15, 15, 4),
                             r =  language_mat
)

df$cog_processing_language <- rnorm_pre(
  data.frame(df$sem_sim, language_vecs$cog_processing_language), 
  mu = 4, 
  sd = 3, 
  r = c(0.2, 0.8)  # Replace with the desired correlation
) %>% pmax(., 0)
df$memory_language <- rnorm_pre(
  data.frame(df$sem_sim, language_vecs$memory_language), 
  mu = 2, 
  sd = 4, 
  r = c(0.10, 0.8)  # Replace with the desired correlation
) %>% pmax(., 0)
df$certain_language <- rnorm_pre(
  data.frame(df$sem_sim, language_vecs$certain_language), 
  mu = 2, 
  sd = 4, 
  r = c(0.12, 0.8)  # Replace with the desired correlation
) %>% pmax(., 0)
df$self_ref_language <- rnorm_pre(
  data.frame(df$sem_sim, language_vecs$self_ref_language), 
  mu = 10, 
  sd = 2, 
  r = c(0.51, 0.8)  # Replace with the desired correlation
) %>% pmax(., 0)
df$curious_language <- rnorm_pre(
  data.frame(df$sem_sim, language_vecs$curious_language), 
  mu = 1, 
  sd = 10, 
  r = c(0.29, 0.8)  # Replace with the desired correlation
) %>% pmax(., 0)
df$perception_language <- rnorm_pre(
  data.frame(df$sem_sim, language_vecs$perception_language), 
  mu = 2, 
  sd = 10, 
  r = c(0.40, 0.8)  # Replace with the desired correlation
) %>% pmax(., 0)
df$emo_anxious <- rnorm_pre(
  data.frame(df$sem_sim, language_vecs$emo_anxious), 
  mu = 1, 
  sd = 15, 
  r = c(0.14, 0.8)  # Replace with the desired correlation
) %>% pmax(., 0)
df$emo_sad <- rnorm_pre(
  data.frame(df$sem_sim, language_vecs$emo_sad), 
  mu = 2, 
  sd = 15, 
  r = c(0.13, 0.8)  # Replace with the desired correlation
) %>% pmax(., 0)
df$emo_anger <- rnorm_pre(
  data.frame(df$sem_sim, language_vecs$emo_anger), 
  mu = 0.5, 
  sd = 15, 
  r = c(0.19, 0.8)  # Replace with the desired correlation
) %>% pmax(., 0)
df$emo_positive <- rnorm_pre(
  data.frame(df$sem_sim, language_vecs$emo_positive), 
  mu = 1.5, 
  sd = 4, 
  r = c(0.15, 0.8)  # Replace with the desired correlation
) %>% pmax(., 0)

Structural Hypotheses

Semsim increases closer to chapter end

avg_data <- df %>%
  group_by(turn_percent_through_chapter) %>%
  summarize(sem_sim = mean(sem_sim), .groups = "drop")

ggplot(df, aes(x = (turn_percent_through_chapter), y = sem_sim)) +
  geom_line(aes(group = perceiver), color = "gray", 
            alpha = 0.01, size = 0.5) +
  geom_hline(aes(yintercept = 0), color = "black") +
  geom_smooth(data = avg_data, aes(x = turn_percent_through_chapter, 
                                   y = sem_sim),
              method = "loess", se = FALSE, color = "black") +
  labs(
    title = "Turn Distance from Inference by Semantic Similarity",
    x = "Proximity to Inference",
    y = "Semantic Similarity",
    color = "Perceiver"
  ) +
  papaja::theme_apa(
    base_family = "Times New Roman"
  )
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## `geom_smooth()` using formula = 'y ~ x'

# Create explicit grouping factors
df$target_partner <- interaction(df$target, df$partner, drop = TRUE)
df$chapter_target_partner <- interaction(df$chapter, df$target_partner, drop = TRUE)

# Fit the multilevel model
model <- lmer(
  sem_sim ~ turns_from_inference + paradigm + 
    (1 | perceiver / target_partner / chapter_target_partner),
  data = df
)
## Warning in checkConv(attr(opt, "derivs"), opt$par, ctrl = control$checkConv, :
## unable to evaluate scaled gradient
## Warning in checkConv(attr(opt, "derivs"), opt$par, ctrl = control$checkConv, :
## Model failed to converge: degenerate Hessian with 1 negative eigenvalues
# Summarize the model
summary(model)
## Linear mixed model fit by REML ['lmerMod']
## Formula: 
## sem_sim ~ turns_from_inference + paradigm + (1 | perceiver/target_partner/chapter_target_partner)
##    Data: df
## 
## REML criterion at convergence: 433071.8
## 
## Scaled residuals: 
##      Min       1Q   Median       3Q      Max 
## -2.72504 -0.76812  0.06118  0.79077  2.67138 
## 
## Random effects:
##  Groups                                            Name        Variance 
##  chapter_target_partner:(target_partner:perceiver) (Intercept) 0.0036517
##  target_partner:perceiver                          (Intercept) 0.0002721
##  perceiver                                         (Intercept) 0.0004179
##  Residual                                                      0.2303103
##  Std.Dev.
##  0.06043 
##  0.01650 
##  0.02044 
##  0.47991 
## Number of obs: 312572, groups:  
## chapter_target_partner:(target_partner:perceiver), 22340; target_partner:perceiver, 2720; perceiver, 2720
## 
## Fixed effects:
##                             Estimate Std. Error t value
## (Intercept)                0.3950263  0.0033924  116.44
## turns_from_inference      -0.0291351  0.0002044 -142.56
## paradigmStandard Stimulus -0.0518567  0.0031991  -16.21
## 
## Correlation of Fixed Effects:
##             (Intr) trns__
## trns_frm_nf -0.508       
## prdgmStndrS -0.857  0.138
## optimizer (nloptwrap) convergence code: 0 (OK)
## unable to evaluate scaled gradient
## Model failed to converge: degenerate  Hessian with 1 negative eigenvalues

Semsim increases for both Target and Partner as approaches inference but increases more for target than partner

avg_data <- df %>%
  group_by(turn_percent_through_chapter, speaker) %>%
  summarize(sem_sim = mean(sem_sim), .groups = "drop")

ggplot(df, aes(x = turn_percent_through_chapter, y = sem_sim)) +
  geom_line(aes(group = perceiver, color = speaker), 
            alpha = 0.005, size = 0.5) +
  scale_color_manual(
    values = c("Partner" = "red", "Target" = "blue"),  
    name = "Speaker"
  ) +
  geom_hline(aes(yintercept = 0), color = "black") +
  # Separate average lines for Target and Partner
  geom_smooth(data = avg_data %>% filter(speaker == "Target"), 
              aes(x = turn_percent_through_chapter, y = sem_sim),
              method = "loess", se = FALSE, color = "red") +
  geom_smooth(data = avg_data %>% filter(speaker == "Partner"),
            aes(x = turn_percent_through_chapter, y = sem_sim),
              method = "loess", se = FALSE, color = "blue") +
  labs(
    title = "Turn Distance from Inference by Semantic Similarity",
    x = "Proximity to Inference",
    y = "Semantic Similarity",
    color = "Speaker"
  ) +
  papaja::theme_apa(
    base_family = "Times New Roman"
  ) +
  theme(legend.position = "none")
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'

Linguistic Figures

LanguageFigsFunction <- function(df, variable, var_name = "VARIABLE", subtitle = FALSE, subtitle_text = NA){
  
  plot <- ggplot(df, aes(x = sem_sim, y = !!sym(variable))) +
      geom_point(color = "black", alpha = 0.01, size = 0.5) + 
      geom_smooth(color = "black", method = "lm", se = TRUE) +
      theme_apa(base_family = "Times New Roman") +
      labs(
        title = paste0("Correlation Between\n ", 
                       var_name, " and Semantic Similarity"),
        x = "Semantic Similarity",
        y = var_name,
        caption = paste0("Correlation = ",
                         round(cor(df["sem_sim"], df[variable]), 2))
  )  
  if (subtitle) {
    plot <- plot + labs(subtitle = subtitle_text)
  }
  return(plot)
}
LanguageFigsFunction(df, variable = "cog_processing_language", "Cognitive Processing Language",
                     subtitle = TRUE, subtitle_text = "e.g., know, think, cause")
## `geom_smooth()` using formula = 'y ~ x'

LanguageFigsFunction(df, variable = "memory_language", "Memory Language")
## `geom_smooth()` using formula = 'y ~ x'

LanguageFigsFunction(df, variable = "certain_language", "Certainty Language")
## `geom_smooth()` using formula = 'y ~ x'

LanguageFigsFunction(df, variable = "self_ref_language", "Self-Referential Language")
## `geom_smooth()` using formula = 'y ~ x'

LanguageFigsFunction(df, variable = "curious_language", "Curiousity Language")
## `geom_smooth()` using formula = 'y ~ x'

LanguageFigsFunction(df, variable = "perception_language", "Perception Language")
## `geom_smooth()` using formula = 'y ~ x'

LanguageFigsFunction(df, variable = "emo_anxious", "Anxious Emotion")
## `geom_smooth()` using formula = 'y ~ x'

LanguageFigsFunction(df, variable = "emo_sad", "Sad Emotion")
## `geom_smooth()` using formula = 'y ~ x'

LanguageFigsFunction(df, variable = "emo_anger", "Anger Emotion")
## `geom_smooth()` using formula = 'y ~ x'

LanguageFigsFunction(df, variable = "emo_positive", "Positive Emotions")
## `geom_smooth()` using formula = 'y ~ x'

cormat <- df %>% 
  select(sem_sim, 
         cog_processing_language, 
         memory_language, 
         certain_language, 
         self_ref_language, 
         curious_language,
         perception_language, 
         emo_anxious, 
         emo_sad, 
         emo_anger, 
         emo_positive) %>% 
  rename("Semantic Similarity" = sem_sim,
         'Cognitive Processing' = cog_processing_language, 
         'Memory' = memory_language, 
         'Certainty' = certain_language, 
         'Self-Referential' = self_ref_language, 
         'Curiousity' = curious_language,
         'Perception'= perception_language,
         'Anxious' = emo_anxious, 
         'Sad' = emo_sad,
         'Anger' = emo_anger, 
         'Positive Emotions' = emo_positive
         ) %>% 
  cor()
cormat_melt <- melt(cormat)

ggplot(cormat_melt, aes(x = Var1, y = Var2, fill = value)) +
  geom_tile() +  # Heatmap tiles
  geom_text(aes(label = format(round(value, 2), nsmall = 2)), color = "black", size = 3) +  # Overlay correlation values
  scale_fill_gradient2(
    low = "blue", high = "red", mid = "white", 
    midpoint = 0, limit = c(-1, 1), space = "Lab",
    name = "Correlation"
  ) +
  labs(
    title = "Heatmap of Correlations",
    x = NULL,
    y = NULL
  ) +
  theme_apa() +
  theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust = 1))  # Rotate x-axis labels